Practical 6: Word Embedding¶

Text Mining, Transforming Text into Knowledge (202400006)¶

In this practical, we are going to work with word embeddings. You can either use your own machine or Google Colab. Try to run all the code during the lab session.

Installation¶

If you're using Google Colab, you should be able to run these commands directly. Otherwise, make sure you have scikit-learn, matplotlib, and numpy installed.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

from sklearn.decomposition import PCA
from numpy import linalg as LA

First install the gensim library¶

In this practical session we're going to use the gensim library. This library offers a variety of methods to read in pre-trained word embeddings as well as train your own.

The website contains a lot of documentation, for example here: https://radimrehurek.com/gensim/auto_examples/index.html#documentation

If gensim isn't installed yet, you can use the following command:

In [2]:
#!pip install gensim
In [3]:
from gensim.test.utils import datapath

Reading in a pre-trained model¶

First we load in a pre-trained GloVe model. Note: this can take around five minutes.

See https://github.com/RaRe-Technologies/gensim-data for an overview of the models you can try. For example:

  • word2vec-google-news-300: word2vec trained on Google News. 1662 MB.
  • glove-twitter-200: GloVe trained on Twitter. 758 MB.

We're going to start with glove-wiki-gigaword-300 (376.1 MB). These embeddings are trained on Wikipedia (2014) and the Gigaword corpus, a large collection of newswire text.

In [4]:
import gensim.downloader as api
wv = api.load('glove-wiki-gigaword-300')

Exploring the vocabulary¶

How many words does the vocabulary contain?

In [5]:
len(wv)
Out[5]:
400000

Is 'utrecht' in the vocabulary?

In [6]:
'utrecht' in wv
Out[6]:
True

Print a word embedding.

In [7]:
wv["utrecht"]
Out[7]:
array([ 1.7246e-01, -9.2534e-02,  1.6428e-01,  5.6065e-01,  2.8004e-01,
       -1.5202e-02,  2.7825e-01, -6.5038e-01,  6.6932e-03,  1.0561e-01,
        7.0731e-01,  3.0788e-02,  1.6741e-02, -5.1297e-02,  3.9779e-02,
        5.6926e-01, -2.2903e-01,  3.7331e-01,  3.4440e-01,  2.1143e-01,
        1.9787e-01, -2.5727e-01,  2.0160e-01, -2.5648e-02, -4.7996e-01,
       -1.4263e-02, -5.5923e-01,  1.0823e-01, -9.0213e-01,  1.9254e-01,
        4.6962e-01, -4.2952e-01,  8.1015e-01,  1.1222e+00,  2.8237e-01,
       -6.0666e-01,  2.8948e-01,  3.1003e-01, -4.3745e-01, -1.0235e-01,
        3.1119e-01, -3.2637e-02, -5.5157e-01, -5.0208e-01,  4.2994e-01,
        3.2319e-01,  4.6997e-01,  7.5685e-02, -3.1806e-02,  3.5623e-01,
        5.9764e-02,  2.4482e-01,  1.0263e-03,  4.6040e-01,  1.5376e-01,
       -2.7841e-01,  2.6469e-02, -1.0785e-01, -4.5553e-01,  2.3964e-01,
       -4.4707e-01,  1.1521e-01,  2.1919e-01,  4.9167e-01,  3.4367e-01,
       -1.2389e-01,  3.1069e-01,  3.7095e-01,  3.8991e-04, -4.6645e-01,
        8.0286e-02,  1.8450e-01,  1.7059e-01, -4.2067e-01, -4.6711e-01,
        6.6363e-01, -1.6728e-01, -1.2503e-01,  1.9912e-01, -8.2380e-02,
       -1.6341e-01, -3.6798e-01, -1.2205e-01, -5.8729e-02, -6.8966e-01,
       -6.4324e-01,  3.2243e-01,  1.3532e-01, -6.3114e-01,  3.8847e-01,
        1.9127e-02, -9.1993e-01, -5.7550e-02, -4.5361e-01,  3.2246e-01,
       -2.3290e-01,  3.6797e-01, -5.0768e-01, -6.1523e-01, -2.7449e-02,
        9.9997e-02, -8.7165e-01,  7.3179e-01,  2.0256e-01, -9.5692e-02,
       -1.4106e-02,  3.4639e-01,  6.0384e-01, -2.1368e-01,  4.5634e-01,
        3.0276e-01, -2.1606e-01, -5.1111e-02, -8.0779e-02,  5.8191e-02,
       -3.4681e-01,  1.4722e-01, -9.8759e-02, -2.3481e-01, -8.1633e-02,
       -5.1594e-01, -2.9282e-01,  1.0665e-01, -3.3598e-02,  3.9112e-02,
        6.1789e-02, -2.7619e-03,  1.8009e-01,  1.1321e-01,  1.2848e-01,
        5.7384e-01,  2.0626e-01, -1.1215e-02,  5.8909e-01,  8.3938e-01,
       -2.5339e-01,  4.7566e-01, -5.3437e-01,  7.6082e-02,  1.8165e-01,
        5.7592e-01,  1.6382e-01, -2.9846e-02, -2.5505e-01,  2.9101e-01,
       -2.4089e-01,  1.1434e-01,  3.5274e-01, -4.2624e-01,  2.4286e-02,
        4.4413e-02, -4.6485e-02,  1.9847e-01,  7.0653e-01, -1.7389e-02,
        9.8610e-01, -2.3929e-01,  8.0001e-01, -3.6342e-01,  1.1249e-01,
       -3.0034e-01,  9.2404e-02, -2.5635e-01, -1.9328e-01, -1.2457e-01,
        2.5057e-01, -4.5034e-01, -1.0178e-01,  2.4003e-01, -2.0324e-01,
        3.9938e-02, -1.2958e-01, -3.9431e-01, -2.4845e-01, -4.1129e-03,
       -3.4170e-03,  3.8176e-01,  2.1570e-01, -7.4349e-01, -3.9406e-03,
       -2.1713e-01, -3.9769e-01, -2.8074e-01, -1.7159e-01,  8.7543e-01,
       -4.8529e-01, -7.8966e-01, -1.9181e-01,  1.2654e-02,  2.7394e-01,
        4.7824e-01, -1.3372e-01,  6.9161e-01, -2.8090e-01,  1.7082e-01,
       -6.2932e-01, -7.0490e-01, -1.6360e-01,  3.1653e-01,  3.1181e-01,
       -2.5984e-01, -7.1512e-02, -3.5759e-01, -1.2873e-01, -1.0136e-01,
       -8.4309e-01,  1.2680e-01,  2.1268e-01, -2.8869e-01,  9.0773e-02,
       -2.5738e-01, -1.8303e-01, -1.2904e-01,  4.4565e-01, -5.8736e-01,
        7.2059e-01, -3.3435e-01,  1.5862e-01, -9.1833e-02, -5.7568e-01,
        3.3082e-01, -1.8949e-01, -4.1058e-01,  2.8824e-01, -3.7240e-01,
       -3.2524e-01, -5.9380e-01, -1.9471e-01, -2.3064e-01,  5.9878e-02,
       -4.2104e-03,  1.2105e-01,  4.9247e-01, -1.9579e-01, -1.0380e-01,
       -2.4607e-01,  7.2958e-01,  1.6426e-02, -8.1131e-02, -3.9744e-02,
       -2.0024e-01, -1.0811e-01,  5.1185e-01,  1.2625e-01, -1.0223e-01,
        4.4235e-01, -5.3241e-01, -8.0116e-02,  5.3696e-01, -6.7007e-01,
        1.0677e-01, -6.4972e-02,  2.8323e-01, -1.1779e-01, -2.0966e-01,
        1.6076e-01,  7.3603e-01, -4.5484e-01, -4.4733e-01, -1.2544e-02,
        3.2596e-01,  2.9868e-01,  7.2416e-01,  8.8062e-01,  6.8829e-01,
        2.2420e-01, -5.9788e-01, -3.2725e-01,  2.0964e-01, -5.4107e-01,
       -4.6910e-01, -7.1801e-01, -1.5257e-01,  1.1137e-01,  4.0723e-01,
        3.6233e-01,  3.3883e-01, -6.8084e-01, -1.4319e-01,  8.5794e-01,
       -4.4418e-01,  5.6938e-01, -1.9381e-01, -8.5069e-01, -2.1100e-01,
        1.1303e-01, -3.8039e-01,  6.2520e-01, -4.8425e-02,  4.5813e-02,
       -3.5546e-01,  1.8457e-01,  2.1444e-02, -2.9848e-01,  4.2873e-01,
       -1.7749e-01,  5.0280e-01, -1.7376e-01,  4.2369e-01, -1.5048e-03],
      dtype=float32)

How many dimensions does this embedding have?

In [8]:
wv["utrecht"].shape
Out[8]:
(300,)

Question: Explore the embeddings for a few other words. Can you find words that are not in the vocabulary?

(For example, think of uncommon words, misspellings, new words, etc.)
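
For example, a minimal sketch for checking a handful of candidate words (the words below are just guesses and may or may not be in the vocabulary):

# check a few candidate words against the vocabulary
for word in ['serendipity', 'covid', 'accomodation', 'doomscrolling']:
    if word in wv:
        print(word, "-> in vocabulary")
    else:
        print(word, "-> not in vocabulary")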

Vector arithmetic¶

We can calculate the cosine similarity between two words in this way:

In [9]:
wv.similarity('university', 'student')
Out[9]:
0.5970514

Optional: cosine similarity is the same as the dot product of the normalized word embeddings.

In [10]:
wv_university_norm = wv['university']/ LA.norm(wv['university'], 2)
wv_student_norm = wv['student'] / LA.norm(wv['student'], 2)

wv_university_norm.dot(wv_student_norm)
Out[10]:
0.5970514

A normalized embedding has an L2 norm (length) of 1.

In [11]:
LA.norm(wv_student_norm)
Out[11]:
1.0

Similarity analysis¶

Print the top 5 most similar words to car

In [12]:
print(wv.most_similar(positive=['car'], topn=5))
[('cars', 0.7827162146568298), ('vehicle', 0.7655367851257324), ('truck', 0.7350621819496155), ('driver', 0.7114784717559814), ('driving', 0.6442225575447083)]

Question: What are the top 5 most similar words to cat? And to king? And to fast? What kind of words often appear in the top?

Question: Try out a few words that have changed in meaning over the last few years.
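
One way to start answering these questions is to loop over a few probe words (a sketch; the word list is only an example and mixes the suggested words with a couple of words whose usage has shifted in recent years):

# print the top-5 nearest neighbours for several probe words
for word in ['cat', 'king', 'fast', 'tweet', 'viral']:
    print(word, '->', wv.most_similar(positive=[word], topn=5))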

Now calculate the similarity between a few word pairs:

In [13]:
wv.similarity('buy', 'purchase')
Out[13]:
0.77922326
In [14]:
wv.similarity('cat', 'dog')
Out[14]:
0.68167466
In [15]:
wv.similarity('car', 'green')
Out[15]:
0.25130013

We can calculate the cosine similarity between a list of word pairs and correlate these with human ratings. One such dataset with human ratings is called WordSim353.

Go to https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/test/test_data/wordsim353.tsv to get a sense of the data.

Gensim already implements a method to evaluate a word embedding model using this data.

  • It calculates the cosine similarity between each word pair
  • It calculates both the Spearman and Pearson correlation coefficient between the cosine similarities and human judgements

See https://radimrehurek.com/gensim/models/keyedvectors.html for a description of the methods.

In [16]:
wv.evaluate_word_pairs(datapath('wordsim353.tsv'))
Out[16]:
((0.6040760940127656, 1.752303459427209e-36),
 SpearmanrResult(correlation=0.6085349998820805, pvalue=3.879629536780527e-37),
 0.0)
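
If you only want to report the correlation coefficients, the returned tuple can be unpacked like this (a sketch based on the output above; the variable names are our own):

# unpack (Pearson, Spearman, out-of-vocabulary ratio)
pearson, spearman, oov_ratio = wv.evaluate_word_pairs(datapath('wordsim353.tsv'))
print("Pearson r:    %.3f" % pearson[0])
print("Spearman rho: %.3f" % spearman[0])
print("Out-of-vocabulary ratio: %.1f%%" % oov_ratio)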

Analogies¶

Man is to woman as king is to ...?

This can be converted into vector arithmetic:

king - ? = man - woman

king - man + woman = ?
In [17]:
wv.most_similar(negative=['man'], positive=['king', 'woman'])
Out[17]:
[('queen', 0.6713277101516724),
 ('princess', 0.5432624816894531),
 ('throne', 0.5386103987693787),
 ('monarch', 0.5347574949264526),
 ('daughter', 0.49802514910697937),
 ('mother', 0.49564430117607117),
 ('elizabeth', 0.4832652509212494),
 ('kingdom', 0.47747090458869934),
 ('prince', 0.4668239951133728),
 ('wife', 0.46473270654678345)]

france - paris + amsterdam = ?

In [18]:
wv.most_similar(negative=['paris'], positive=['france', 'amsterdam'])
Out[18]:
[('netherlands', 0.7304360866546631),
 ('dutch', 0.5829049944877625),
 ('belgium', 0.5607961416244507),
 ('holland', 0.5492807626724243),
 ('denmark', 0.5330449938774109),
 ('sweden', 0.4875030517578125),
 ('germany', 0.4710354804992676),
 ('utrecht', 0.46798408031463623),
 ('spain', 0.46100151538848877),
 ('rotterdam', 0.45599010586738586)]

Note that if we just retrieved the most similar words to 'amsterdam', we would get a different result.

In [19]:
print(wv.most_similar(positive=['amsterdam'], topn=5))
[('rotterdam', 0.6485881209373474), ('schiphol', 0.5740087032318115), ('utrecht', 0.5608800053596497), ('netherlands', 0.5472348928451538), ('frankfurt', 0.5457332730293274)]

cat is to cats as girl is to ...?

girl - ? = cat - cats
girl - cat + cats = ?
In [20]:
wv.most_similar(negative=['cat'], positive=['cats', 'girl'])
Out[20]:
[('girls', 0.6908110976219177),
 ('boys', 0.6055023074150085),
 ('boy', 0.5850904583930969),
 ('teenage', 0.5735769867897034),
 ('teenagers', 0.5600976943969727),
 ('teenager', 0.5530085563659668),
 ('teen', 0.5423317551612854),
 ('children', 0.5312655568122864),
 ('woman', 0.5233089327812195),
 ('babies', 0.5163544416427612)]

Compare against a baseline: what if we had just retrieved the most similar words to 'girl'?

In [21]:
print(wv.most_similar(positive=['girl'], topn=5))
[('boy', 0.8272891044616699), ('woman', 0.729641854763031), ('girls', 0.7227292060852051), ('teenager', 0.6509774327278137), ('teenage', 0.6492719054222107)]

Question: Try a few of your own analogies, do you get the expected answer?
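
To make this less repetitive, you could wrap the arithmetic in a small helper (a sketch; the function name and the second example analogy are our own choices):

def analogy(a, b, c, topn=5):
    # a is to b as c is to ?  ->  answer vector is b - a + c
    return wv.most_similar(positive=[b, c], negative=[a], topn=topn)

print(analogy('man', 'woman', 'king', topn=3))
print(analogy('netherlands', 'dutch', 'france', topn=3))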

Visualization¶

We can't visualize embeddings in their raw format, because of their high dimensionality. However, we can use dimensionality reduction techniques such as PCA to project them onto a 2D space.

In [22]:
def display_scatterplot(wv, words):

    # first get the word vectors for the given words
    word_vectors = np.array([wv[w] for w in words])

    # project the vectors onto their first two principal components
    wv_PCA = PCA(n_components=2).fit_transform(word_vectors)

    plt.figure(figsize=(10, 10))

    plt.scatter(wv_PCA[:, 0], wv_PCA[:, 1],
                edgecolors='k', c='r')

    # label each point with its word
    for word, (x, y) in zip(words, wv_PCA):
        plt.text(x + 0.05, y + 0.05, word)
In [23]:
display_scatterplot(wv,
                        ['dog', 'cat', 'dogs', 'cats', 'horse', 'tiger',
                         'university', 'lesson', 'student', 'students',
                         'netherlands', 'amsterdam', 'utrecht', 'belgium', 'spain', 'china',
                         'coffee', 'tea', 'pizza', 'sushi', 'sandwich',
                         'car', 'train', 'bike', 'bicycle', 'trains'])
[Output: PCA scatterplot of the selected word embeddings]

Question: What do you notice in this plot? Do the distances between the words make sense? Any surprises? Feel free to add your own words!

Biases¶

In [24]:
display_scatterplot(wv,
                        ['he', 'she', 'sister',
                         'brother', 'man', 'woman',
                         'nurse', 'doctor',
                         'grandfather', 'grandmother',
                         'math', 'arts',
                         'daughter', 'son'])
[Output: PCA scatterplot of gendered words and occupation/subject words]
In [25]:
def calc_avg_similiarity(wv, attribute_words, target_word):
    score = 0

    # average the cosine similarities between the target word and each attribute word
    for attribute_word in attribute_words:
        score += wv.similarity(attribute_word, target_word)

    return score / len(attribute_words)
In [26]:
# set of attribute words
attribute_words_m = ['male', 'man', 'boy', 'brother', 'he', 'him', 'his', 'son']
attribute_words_f = ['female', 'woman', 'girl', 'sister', 'she',
                     'her', 'hers', 'daughter']

Is math more associated with male or female words?

Compute the average cosine similarity between the target word and the set of attribute words.

In [27]:
print("Avg. similarity with male words: %.3f" %
      calc_avg_similiarity(wv, attribute_words_m, 'math'))
print("Avg. similarity with female words: %.3f" %
      calc_avg_similiarity(wv, attribute_words_f, 'math'))
Avg. similarity with male words: 0.118
Avg. similarity with female words: 0.105

What about poetry?

In [28]:
print("Avg. similarity with male words: %.3f" %
      calc_avg_similiarity(wv, attribute_words_m, 'poetry'))
print("Avg. similarity with female words: %.3f" %
      calc_avg_similiarity(wv, attribute_words_f, 'poetry'))
Avg. similarity with male words: 0.166
Avg. similarity with female words: 0.185
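
If you want to probe several target words at once, here is a small sketch (the target words are arbitrary examples, assumed to be in the vocabulary):

# compare the average similarity to the male and female attribute sets
for target in ['science', 'nurse', 'engineer', 'dance']:
    print("%s: male %.3f, female %.3f" % (
        target,
        calc_avg_similiarity(wv, attribute_words_m, target),
        calc_avg_similiarity(wv, attribute_words_f, target)))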

Next¶

Repeat this analysis but now with the GloVe model trained on Twitter data, e.g. glove-twitter-50.

  • Which model obtains better performance on the word similarity task (WordSim353)?
  • What other differences do you observe? (e.g. think of the vocabulary, biases, etc.)
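
A minimal sketch to get started (downloading glove-twitter-50 takes a while; the comparison lines below are only suggestions, not part of the original analysis):

# load the Twitter-trained GloVe embeddings
wv_tw = api.load('glove-twitter-50')

# word-similarity evaluation on the Twitter embeddings
print(wv_tw.evaluate_word_pairs(datapath('wordsim353.tsv')))

# vocabulary size and an example lookup
print(len(wv_tw), 'utrecht' in wv_tw)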

FastText¶

(Only if you have time: loading this model can take a while, around 11 minutes.)

Load a fastText model.

In [29]:
wv_f = api.load('fasttext-wiki-news-subwords-300')

Usage is very similar to before. For example, we can calculate the similarity between two words:

In [30]:
print(wv_f.similarity("dog", "dogs"))
0.8457405

Question: How does this compare to the similarity scores you obtained with the other models you tried?
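
One way to compare the two models side by side, sketched below (the word pairs are arbitrary examples and assume both wv and wv_f are loaded):

# similarity scores for the same pairs under GloVe and fastText
pairs = [('dog', 'dogs'), ('buy', 'purchase'), ('cat', 'dog'), ('car', 'green')]
for w1, w2 in pairs:
    print("%s - %s: GloVe %.3f, fastText %.3f" %
          (w1, w2, wv.similarity(w1, w2), wv_f.similarity(w1, w2)))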